In [8]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler, StandardScaler
#import seaborn as sns
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, accuracy_score

from sklearn.ensemble import RandomForestClassifier
In [1]:
jupyter nbconvert --to html "churn_Modelling assignment.ipynb"
  Cell In[1], line 1
    jupyter nbconvert --to html "churn_Modelling assignment.ipynb"
            ^
SyntaxError: invalid syntax
In [9]:
import matplotlib
matplotlib.use('TkAgg')  
import matplotlib.pyplot as plt
In [10]:
#load dataset
df = pd.read_csv('promotion_dataset.csv')
df.head()
Out[10]:
EmployeeNo Division Qualification Gender Channel_of_Recruitment Trainings_Attended Year_of_birth Last_performance_score Year_of_recruitment Targets_met Previous_Award Training_score_average State_Of_Origin Foreign_schooled Marital_Status Past_Disciplinary_Action Previous_IntraDepartmental_Movement No_of_previous_employers Promoted_or_Not
0 YAK/S/00001 Commercial Sales and Marketing MSc MBA and PhD Female Direct Internal process 2 1986 12.5 2011 1 0 41 ANAMBRA No Married No No 0 0
1 YAK/S/00002 Customer Support and Field Operations First Degree or HND Male Agency and others 2 1991 12.5 2015 0 0 52 ANAMBRA Yes Married No No 0 0
2 YAK/S/00003 Commercial Sales and Marketing First Degree or HND Male Direct Internal process 2 1987 7.5 2012 0 0 42 KATSINA Yes Married No No 0 0
3 YAK/S/00004 Commercial Sales and Marketing First Degree or HND Male Agency and others 3 1982 2.5 2009 0 0 42 NIGER Yes Single No No 1 0
4 YAK/S/00006 Information and Strategy First Degree or HND Male Direct Internal process 3 1990 7.5 2012 0 0 77 AKWA IBOM Yes Married No No 1 0
In [11]:
# getting information about dataset
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 38312 entries, 0 to 38311
Data columns (total 19 columns):
 #   Column                               Non-Null Count  Dtype  
---  ------                               --------------  -----  
 0   EmployeeNo                           38312 non-null  object 
 1   Division                             38312 non-null  object 
 2   Qualification                        36633 non-null  object 
 3   Gender                               38312 non-null  object 
 4   Channel_of_Recruitment               38312 non-null  object 
 5   Trainings_Attended                   38312 non-null  int64  
 6   Year_of_birth                        38312 non-null  int64  
 7   Last_performance_score               38312 non-null  float64
 8   Year_of_recruitment                  38312 non-null  int64  
 9   Targets_met                          38312 non-null  int64  
 10  Previous_Award                       38312 non-null  int64  
 11  Training_score_average               38312 non-null  int64  
 12  State_Of_Origin                      38312 non-null  object 
 13  Foreign_schooled                     38312 non-null  object 
 14  Marital_Status                       38312 non-null  object 
 15  Past_Disciplinary_Action             38312 non-null  object 
 16  Previous_IntraDepartmental_Movement  38312 non-null  object 
 17  No_of_previous_employers             38312 non-null  int64  
 18  Promoted_or_Not                      38312 non-null  int64  
dtypes: float64(1), int64(8), object(10)
memory usage: 5.6+ MB
In [12]:
# more info on rows and collumns
df.shape
Out[12]:
(38312, 19)
In [13]:
#information about missing data
round((df.isnull().sum() / df.shape[0]) * 100, 2)
Out[13]:
EmployeeNo                             0.00
Division                               0.00
Qualification                          4.38
Gender                                 0.00
Channel_of_Recruitment                 0.00
Trainings_Attended                     0.00
Year_of_birth                          0.00
Last_performance_score                 0.00
Year_of_recruitment                    0.00
Targets_met                            0.00
Previous_Award                         0.00
Training_score_average                 0.00
State_Of_Origin                        0.00
Foreign_schooled                       0.00
Marital_Status                         0.00
Past_Disciplinary_Action               0.00
Previous_IntraDepartmental_Movement    0.00
No_of_previous_employers               0.00
Promoted_or_Not                        0.00
dtype: float64
In [14]:
# handling missing data by filling with mode
df['Qualification'] = df['Qualification'].fillna(df['Qualification'].mode()[0])
In [15]:
#crosscheck missing data filled
round((df.isnull().sum() / df.shape[0]) * 100, 2)
Out[15]:
EmployeeNo                             0.0
Division                               0.0
Qualification                          0.0
Gender                                 0.0
Channel_of_Recruitment                 0.0
Trainings_Attended                     0.0
Year_of_birth                          0.0
Last_performance_score                 0.0
Year_of_recruitment                    0.0
Targets_met                            0.0
Previous_Award                         0.0
Training_score_average                 0.0
State_Of_Origin                        0.0
Foreign_schooled                       0.0
Marital_Status                         0.0
Past_Disciplinary_Action               0.0
Previous_IntraDepartmental_Movement    0.0
No_of_previous_employers               0.0
Promoted_or_Not                        0.0
dtype: float64
In [16]:
# identify collumn data types
cat_col = [col for col in df.columns if df[col].dtype == 'object']
num_col = [col for col in df.columns if df[col].dtype != 'object']

print('Categorical columns:', cat_col)
print('Numerical columns:', num_col)
Categorical columns: ['EmployeeNo', 'Division', 'Qualification', 'Gender', 'Channel_of_Recruitment', 'State_Of_Origin', 'Foreign_schooled', 'Marital_Status', 'Past_Disciplinary_Action', 'Previous_IntraDepartmental_Movement']
Numerical columns: ['Trainings_Attended', 'Year_of_birth', 'Last_performance_score', 'Year_of_recruitment', 'Targets_met', 'Previous_Award', 'Training_score_average', 'No_of_previous_employers', 'Promoted_or_Not']
In [17]:
#count unique data for categorical data types
df[cat_col].nunique()
Out[17]:
EmployeeNo                             38312
Division                                   9
Qualification                              3
Gender                                     2
Channel_of_Recruitment                     3
State_Of_Origin                           37
Foreign_schooled                           2
Marital_Status                             3
Past_Disciplinary_Action                   2
Previous_IntraDepartmental_Movement        2
dtype: int64
In [18]:
#statistical summary of dataset
df.describe()
Out[18]:
Trainings_Attended Year_of_birth Last_performance_score Year_of_recruitment Targets_met Previous_Award Training_score_average No_of_previous_employers Promoted_or_Not
count 38312.000000 38312.000000 38312.000000 38312.000000 38312.000000 38312.000000 38312.000000 38312.000000 38312.000000
mean 2.253680 1986.209334 7.698959 2013.139695 0.352996 0.023152 55.366465 1.040953 0.084595
std 0.609443 7.646047 3.744135 4.261451 0.477908 0.150388 13.362741 1.235738 0.278282
min 2.000000 1950.000000 0.000000 1982.000000 0.000000 0.000000 31.000000 0.000000 0.000000
25% 2.000000 1982.000000 5.000000 2012.000000 0.000000 0.000000 43.000000 0.000000 0.000000
50% 2.000000 1988.000000 7.500000 2014.000000 0.000000 0.000000 52.000000 1.000000 0.000000
75% 2.000000 1992.000000 10.000000 2016.000000 1.000000 0.000000 68.000000 1.000000 0.000000
max 11.000000 2001.000000 12.500000 2018.000000 1.000000 1.000000 91.000000 6.000000 1.000000
In [19]:
# generate profiling report 
from ydata_profiling import ProfileReport
df.profile_report()
Upgrade to ydata-sdk

Improve your data and profiling with ydata-sdk, featuring data quality scoring, redundancy detection, outlier identification, text validation, and synthetic data generation.

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]
100%|██████████| 19/19 [00:00<00:00, 27.90it/s]
Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]
Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]
Out[19]:

In [20]:
#correlation analysis
df.corr(numeric_only=True)
Out[20]:
Trainings_Attended Year_of_birth Last_performance_score Year_of_recruitment Targets_met Previous_Award Training_score_average No_of_previous_employers Promoted_or_Not
Trainings_Attended 1.000000 0.078710 -0.062042 0.056215 -0.044789 -0.007409 0.041065 0.000796 -0.024345
Year_of_birth 0.078710 1.000000 -0.175572 0.654666 0.025337 0.013627 0.048390 -0.003117 0.017991
Last_performance_score -0.062042 -0.175572 1.000000 -0.190333 0.276350 0.026587 0.057836 -0.005428 0.119690
Year_of_recruitment 0.056215 0.654666 -0.190333 1.000000 0.076910 0.041995 0.037477 -0.003550 0.012287
Targets_met -0.044789 0.025337 0.276350 0.076910 1.000000 0.092934 0.077201 -0.003308 0.224518
Previous_Award -0.007409 0.013627 0.026587 0.041995 0.092934 1.000000 0.072360 0.003887 0.201434
Training_score_average 0.041065 0.048390 0.057836 0.037477 0.077201 0.072360 1.000000 0.008194 0.178448
No_of_previous_employers 0.000796 -0.003117 -0.005428 -0.003550 -0.003308 0.003887 0.008194 1.000000 0.001690
Promoted_or_Not -0.024345 0.017991 0.119690 0.012287 0.224518 0.201434 0.178448 0.001690 1.000000
In [21]:
sns.countplot(x='Promoted_or_Not', data=df, hue='Promoted_or_Not', palette='Set1')
plt.title('promotion distribution')
plt.show()
C:\Users\HP\AppData\Local\Temp\ipykernel_8608\2511900159.py:3: UserWarning: FigureCanvasAgg is non-interactive, and thus cannot be shown
  plt.show()
In [22]:
plt.boxplot(df['Training_score_average'], vert=False)
plt.ylabel('Variable')
plt.xlabel('Targets_met')
plt.title('Box Plot')
plt.show()
C:\Users\HP\AppData\Local\Temp\ipykernel_8608\3987866815.py:5: UserWarning: FigureCanvasAgg is non-interactive, and thus cannot be shown
  plt.show()
In [23]:
plt.boxplot(df['Last_performance_score'], vert=False)
plt.ylabel('Variable')
plt.xlabel('Last_performance_score')
plt.title('Box Plot')
plt.show()
C:\Users\HP\AppData\Local\Temp\ipykernel_8608\3335382509.py:5: UserWarning: FigureCanvasAgg is non-interactive, and thus cannot be shown
  plt.show()
In [24]:
plt.boxplot(df['Year_of_recruitment'], vert=False)
plt.ylabel('Variable')
plt.xlabel('Year_of_recruitment')
plt.title('corr Plot in search of outliers')
plt.show()
C:\Users\HP\AppData\Local\Temp\ipykernel_8608\3332865500.py:5: UserWarning: FigureCanvasAgg is non-interactive, and thus cannot be shown
  plt.show()
In [25]:
# feature engineering - creating new feature 'Age' from 'Year_of_birth' to drop redundant data
df["Age"] = 2025 - df["Year_of_birth"]
df.head()
Out[25]:
EmployeeNo Division Qualification Gender Channel_of_Recruitment Trainings_Attended Year_of_birth Last_performance_score Year_of_recruitment Targets_met Previous_Award Training_score_average State_Of_Origin Foreign_schooled Marital_Status Past_Disciplinary_Action Previous_IntraDepartmental_Movement No_of_previous_employers Promoted_or_Not Age
0 YAK/S/00001 Commercial Sales and Marketing MSc MBA and PhD Female Direct Internal process 2 1986 12.5 2011 1 0 41 ANAMBRA No Married No No 0 0 39
1 YAK/S/00002 Customer Support and Field Operations First Degree or HND Male Agency and others 2 1991 12.5 2015 0 0 52 ANAMBRA Yes Married No No 0 0 34
2 YAK/S/00003 Commercial Sales and Marketing First Degree or HND Male Direct Internal process 2 1987 7.5 2012 0 0 42 KATSINA Yes Married No No 0 0 38
3 YAK/S/00004 Commercial Sales and Marketing First Degree or HND Male Agency and others 3 1982 2.5 2009 0 0 42 NIGER Yes Single No No 1 0 43
4 YAK/S/00006 Information and Strategy First Degree or HND Male Direct Internal process 3 1990 7.5 2012 0 0 77 AKWA IBOM Yes Married No No 1 0 35
In [ ]:
#separating features and target variable
X = df[['Division', 'Qualification', 'Gender', 'Channel_of_Recruitment','Age', 'Trainings_Attended', 'Last_performance_score', 'Year_of_recruitment', 'Targets_met', 'Previous_Award', 'Training_score_average', 'Foreign_schooled', 'Past_Disciplinary_Action', 'Previous_IntraDepartmental_Movement', 'No_of_previous_employers']]
Y = df['Promoted_or_Not']
In [27]:
# scaling numerical features
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import LabelEncoder

x1 = X.copy()

scaler = MinMaxScaler(feature_range=(0, 1))

num_col_ = [col for col in X.columns if X[col].dtype != 'object']

num_cols = x1.select_dtypes(include=['int64', 'float64']).columns
x1[num_col_] = scaler.fit_transform(x1[num_col_])

# encoding categorical features
cat_cols = x1.select_dtypes(include=['object']).columns
le = LabelEncoder()
for col in cat_cols:
    x1[col] = le.fit_transform(x1[col])
x1.head()
Out[27]:
Division Qualification Gender Channel_of_Recruitment Age Trainings_Attended Last_performance_score Year_of_recruitment Targets_met Previous_Award Training_score_average Foreign_schooled Past_Disciplinary_Action Previous_IntraDepartmental_Movement No_of_previous_employers
0 1 1 0 1 0.294118 0.000000 1.0 0.805556 1.0 0.0 0.166667 0 0 0 0.000000
1 2 0 1 0 0.196078 0.000000 1.0 0.916667 0.0 0.0 0.350000 1 0 0 0.000000
2 1 0 1 1 0.274510 0.000000 0.6 0.833333 0.0 0.0 0.183333 1 0 0 0.000000
3 1 0 1 0 0.372549 0.111111 0.2 0.750000 0.0 0.0 0.183333 1 0 0 0.166667
4 4 0 1 1 0.215686 0.111111 0.6 0.833333 0.0 0.0 0.766667 1 0 0 0.166667
In [28]:
# standardizing numerical features
scaler = StandardScaler()
X_standardized = scaler.fit_transform(x1)
print(X_standardized[:5])
[[-0.83158796  1.41479513 -1.53339317  0.99209997  0.02737841 -0.41625517
   1.28229978 -0.5021114   1.35384256 -0.15395043 -1.07512768 -3.24810652
  -0.06290405 -0.32218928 -0.84238477]
 [-0.41929682 -0.61751078  0.65214846 -0.86183301 -0.62656278 -0.41625517
   1.28229978  0.43654831 -0.73863832 -0.15395043 -0.25193251  0.30787168
  -0.06290405 -0.32218928 -0.84238477]
 [-0.83158796 -0.61751078  0.65214846  0.99209997 -0.10340983 -0.41625517
  -0.05313941 -0.26744648 -0.73863832 -0.15395043 -1.00029176  0.30787168
  -0.06290405 -0.32218928 -0.84238477]
 [-0.83158796 -0.61751078  0.65214846 -0.86183301  0.55053137  1.22460994
  -1.3885786  -0.97144126 -0.73863832 -0.15395043 -1.00029176  0.30787168
  -0.06290405 -0.32218928 -0.03314114]
 [ 0.40528546 -0.61751078  0.65214846  0.99209997 -0.49577454  1.22460994
  -0.05313941 -0.26744648 -0.73863832 -0.15395043  1.61896561  0.30787168
  -0.06290405 -0.32218928 -0.03314114]]
In [29]:
# preparing for train-test split
categorical_features_X = X.select_dtypes(include=['object']).columns.tolist()
numerical_features_X = X.select_dtypes(include=['int64', 'float64']).columns.tolist()

preprocessor = ColumnTransformer(
    transformers=[
        ("cat", OneHotEncoder(handle_unknown='ignore'), categorical_features_X),
        ("num", StandardScaler(), numerical_features_X)
    ]
)
In [30]:
# train-test split
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.25, random_state=42)
In [31]:
# building random forest model
rf_model = Pipeline(steps=[
    ('preprocess', preprocessor),
    ('model', RandomForestClassifier(
        n_estimators=300,
        max_depth=None,
        random_state=42,
        class_weight='balanced'  # handle imbalanced classes
    ))
])
# Fit
rf_model.fit(X_train, Y_train)
Out[31]:
Pipeline(steps=[('preprocess',
                 ColumnTransformer(transformers=[('cat',
                                                  OneHotEncoder(handle_unknown='ignore'),
                                                  ['Division', 'Qualification',
                                                   'Gender',
                                                   'Channel_of_Recruitment',
                                                   'Foreign_schooled',
                                                   'Past_Disciplinary_Action',
                                                   'Previous_IntraDepartmental_Movement']),
                                                 ('num', StandardScaler(),
                                                  ['Age', 'Trainings_Attended',
                                                   'Last_performance_score',
                                                   'Year_of_recruitment',
                                                   'Targets_met',
                                                   'Previous_Award',
                                                   'Training_score_average',
                                                   'No_of_previous_employers'])])),
                ('model',
                 RandomForestClassifier(class_weight='balanced',
                                        n_estimators=300, random_state=42))])
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
Pipeline(steps=[('preprocess',
                 ColumnTransformer(transformers=[('cat',
                                                  OneHotEncoder(handle_unknown='ignore'),
                                                  ['Division', 'Qualification',
                                                   'Gender',
                                                   'Channel_of_Recruitment',
                                                   'Foreign_schooled',
                                                   'Past_Disciplinary_Action',
                                                   'Previous_IntraDepartmental_Movement']),
                                                 ('num', StandardScaler(),
                                                  ['Age', 'Trainings_Attended',
                                                   'Last_performance_score',
                                                   'Year_of_recruitment',
                                                   'Targets_met',
                                                   'Previous_Award',
                                                   'Training_score_average',
                                                   'No_of_previous_employers'])])),
                ('model',
                 RandomForestClassifier(class_weight='balanced',
                                        n_estimators=300, random_state=42))])
ColumnTransformer(transformers=[('cat', OneHotEncoder(handle_unknown='ignore'),
                                 ['Division', 'Qualification', 'Gender',
                                  'Channel_of_Recruitment', 'Foreign_schooled',
                                  'Past_Disciplinary_Action',
                                  'Previous_IntraDepartmental_Movement']),
                                ('num', StandardScaler(),
                                 ['Age', 'Trainings_Attended',
                                  'Last_performance_score',
                                  'Year_of_recruitment', 'Targets_met',
                                  'Previous_Award', 'Training_score_average',
                                  'No_of_previous_employers'])])
['Division', 'Qualification', 'Gender', 'Channel_of_Recruitment', 'Foreign_schooled', 'Past_Disciplinary_Action', 'Previous_IntraDepartmental_Movement']
OneHotEncoder(handle_unknown='ignore')
['Age', 'Trainings_Attended', 'Last_performance_score', 'Year_of_recruitment', 'Targets_met', 'Previous_Award', 'Training_score_average', 'No_of_previous_employers']
StandardScaler()
RandomForestClassifier(class_weight='balanced', n_estimators=300,
                       random_state=42)
In [32]:
# accuracy and classification report
rf_model.fit(X_train, Y_train)

rf_preds = rf_model.predict(X_test)

print("\n=== RANDOM FOREST RESULTS ===")
print("Accuracy:", accuracy_score(Y_test, rf_preds))
print(classification_report(Y_test, rf_preds))
=== RANDOM FOREST RESULTS ===
Accuracy: 0.9323449571935686
              precision    recall  f1-score   support

           0       0.93      1.00      0.96      8768
           1       0.84      0.25      0.38       810

    accuracy                           0.93      9578
   macro avg       0.89      0.62      0.67      9578
weighted avg       0.93      0.93      0.91      9578

In [1]:
import xgboost
from xgboost import XGBClassifier
In [4]:
xgb_model = Pipeline(steps=[
    ("preprocess", preprocessor),
    ("model", XGBClassifier(
        n_estimators=350,
        learning_rate=0.1,
        max_depth=6,
        subsample=0.9,
        colsample_bytree=0.8,
        random_state=42,
        eval_metric="logloss"
    ))
])
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[4], line 1
----> 1 xgb_model = Pipeline(steps=[
      2     ("preprocess", preprocessor),
      3     ("model", XGBClassifier(
      4         n_estimators=350,
      5         learning_rate=0.1,
      6         max_depth=6,
      7         subsample=0.9,
      8         colsample_bytree=0.8,
      9         random_state=42,
     10         eval_metric="logloss"
     11     ))
     12 ])

NameError: name 'Pipeline' is not defined
In [5]:
import sys
print(sys.executable)
c:\Users\HP\AppData\Local\Python\pythoncore-3.14-64\python.exe
In [9]:
import sys
print(sys.executable)
c:\Users\HP\AppData\Local\Python\pythoncore-3.14-64\python.exe
In [8]:
!py -m pip install xgboost
Requirement already satisfied: xgboost in c:\users\hp\appdata\local\python\pythoncore-3.14-64\lib\site-packages (3.1.2)
Requirement already satisfied: numpy in c:\users\hp\appdata\local\python\pythoncore-3.14-64\lib\site-packages (from xgboost) (2.3.5)
Requirement already satisfied: scipy in c:\users\hp\appdata\local\python\pythoncore-3.14-64\lib\site-packages (from xgboost) (1.16.3)